GP regression

# load required libraries
library(gplite)
## This is gplite version 0.13.0
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Change the number of restarts in `gp_optim`.

Generate data for the function to which the model will be fit.

# Number of training points
n <- 5
# Evenly spaced inputs on [-2, 2], stored as an n x 1 matrix
x <- matrix(seq(from = -2, to = 2, length.out = n), ncol = 1)
# Target function: y = x^2
y <- x^2

Plot the data.

# Scatter plot of the raw training data
ggplot() +
  geom_point(aes(x = x, y = y), size = 2) +
  xlab('x') +
  ylab('y')

Initialize the GP model.

# Specify the GP model we want to use:
gp_empty <- gp_init(
  
  # A squared exponential (aka Gaussian aka RBF) kernel.
  # Initial length-scale 0.3 and magnitude 1; both hyperparameters
  # get log-uniform priors so gp_optim can tune them freely.
  cfs = cf_sexp(
    vars = NULL,
    lscale = 0.3,
    magn = 1,
    prior_lscale = prior_logunif(),
    prior_magn = prior_logunif(),
    normalize = FALSE
  ),  
  
  # Assume Gaussian distributed errors
  # (initial noise scale sigma = 0.5, also with a log-uniform prior)
  lik = lik_gaussian(
    sigma = 0.5, 
    prior_sigma = prior_logunif()
  ), 
  
  # Use the full covariance (i.e., do not approximate)
  method = method_full() 
  
)

Optimize the model using the internal function.

# Now fit the model to the data.
# Named arguments for consistency with the other gp_optim calls
# in this document.
gp_optimized <- gp_optim(gp = gp_empty, x = x, y = y, verbose = FALSE)

Plot the model.

# compute the predictive mean and variance in a grid of points
# (spell out length.out instead of the partially matched `len`,
# and use TRUE instead of the reassignable alias T)
xt   <- seq(-4, 4, length.out = 150)
pred <- gp_pred(gp_optimized, xt, var = TRUE)

# visualize: predictive mean plus a +/- 2 standard-deviation band
mu <- pred$mean
lb <- pred$mean - 2*sqrt(pred$var)
ub <- pred$mean + 2*sqrt(pred$var)

# Ribbon = 2-sd predictive band, line = predictive mean, points = data
ggplot() +
  geom_ribbon(aes(x = xt, ymin = lb, ymax = ub), fill = 'lightgray') +
  geom_line(aes(x = xt, y = mu), linewidth = 0.5) +
  geom_point(aes(x = x, y = y), size = 2) +
  xlab('x') +
  ylab('y')

Pull the energy.

# Energy = -log(marginal likelihood) of the fitted model
gp_energy(gp_optimized)
## [1] 11.887

Increase the number of restarts.

# Refit the model, this time giving the optimizer 10 random restarts
gp_optimized_10 <- gp_optim(gp_empty, x, y, restarts = 10, verbose = FALSE)

Plot the model fitted with a larger number of restarts.

# compute the predictive mean and variance in a grid of points
# (use TRUE rather than the reassignable alias T)
pred <- gp_pred(gp_optimized_10, xt, var = TRUE)

# visualize: predictive mean plus a +/- 2 standard-deviation band
mu <- pred$mean
lb <- pred$mean - 2*sqrt(pred$var)
ub <- pred$mean + 2*sqrt(pred$var)

# Ribbon = 2-sd predictive band, line = predictive mean, points = data
ggplot() +
  geom_ribbon(aes(x = xt, ymin = lb, ymax = ub), fill = 'lightgray') +
  geom_line(aes(x = xt, y = mu), linewidth = 0.5) +
  geom_point(aes(x = x, y = y), size = 2) +
  xlab('x') +
  ylab('y')

Pull the energy.

# Energy with 10 restarts -- identical to the single-start fit above
gp_energy(gp_optimized_10)
## [1] 11.887

Increase to 1000 restarts.

# Refit with 1000 random restarts of the optimizer
gp_optimized_1000 <- gp_optim(gp_empty, x, y, restarts = 1000, verbose = FALSE)

Plot the model fitted with a larger number of restarts.

# compute the predictive mean and variance in a grid of points
# (use TRUE rather than the reassignable alias T)
pred <- gp_pred(gp_optimized_1000, xt, var = TRUE)

# visualize: predictive mean plus a +/- 2 standard-deviation band
mu <- pred$mean
lb <- pred$mean - 2*sqrt(pred$var)
ub <- pred$mean + 2*sqrt(pred$var)

# Ribbon = 2-sd predictive band, line = predictive mean, points = data
ggplot() +
  geom_ribbon(aes(x = xt, ymin = lb, ymax = ub), fill = 'lightgray') +
  geom_line(aes(x = xt, y = mu), linewidth = 0.5) +
  geom_point(aes(x = x, y = y), size = 2) +
  xlab('x') +
  ylab('y')

Pull the energy.

# Energy with 1000 restarts -- still unchanged from the single start
gp_energy(gp_optimized_1000)
## [1] 11.887

Increase to 1,000,000 restarts.

# Refit with one million random restarts of the optimizer
gp_optimized_1e6 <- gp_optim(gp_empty, x, y, restarts = 1e6, verbose = FALSE)

Plot the model fitted with a larger number of restarts.

# compute the predictive mean and variance in a grid of points
# (use TRUE rather than the reassignable alias T)
pred <- gp_pred(gp_optimized_1e6, xt, var = TRUE)

# visualize: predictive mean plus a +/- 2 standard-deviation band
mu <- pred$mean
lb <- pred$mean - 2*sqrt(pred$var)
ub <- pred$mean + 2*sqrt(pred$var)

# Ribbon = 2-sd predictive band, line = predictive mean, points = data
ggplot() +
  geom_ribbon(aes(x = xt, ymin = lb, ymax = ub), fill = 'lightgray') +
  geom_line(aes(x = xt, y = mu), linewidth = 0.5) +
  geom_point(aes(x = x, y = y), size = 2) +
  xlab('x') +
  ylab('y')

Pull the energy.

# Energy with 1e6 restarts -- unchanged; the optimum is stable
gp_energy(gp_optimized_1e6)
## [1] 11.887

Plot the marginal likelihood surface with more points.

Recapitulate the original surface plot.

# Candidate length-scales for the grid search
ell <- seq(0.5, 3, by = 0.05)

# Candidate signal magnitudes for the grid search
sigma_f <- seq(2, 8, by = 0.05)

# Every (ell, sigma_f) combination, one row per model to fit
hyperparams <- expand.grid(
  ell = ell,
  sigma_f = sigma_f
)

# Preallocate the energy vector (one slot per grid point) instead of
# growing an empty vector inside the loop, which copies it repeatedly
energy <- numeric(nrow(hyperparams))

# run 6000+ models: fit one GP per grid point and record its energy
# (seq_len(nrow(...)) instead of 1:dim(...)[1] -- safe for empty grids)
for (i in seq_len(nrow(hyperparams))) {
  
  gp_empty <- gp_init(
    # A squared exponential (aka Gaussian aka RBF) kernel with the
    # length-scale and magnitude of this grid point
    cfs = cf_sexp(
      vars = NULL,
      lscale = hyperparams$ell[i],
      magn = hyperparams$sigma_f[i],
      prior_lscale = prior_logunif(),
      prior_magn = prior_logunif(),
      normalize = FALSE
    ),
    
    # Assume Gaussian distributed errors with a small fixed noise scale
    lik = lik_gaussian(
      sigma = 0.01, 
      prior_sigma = prior_logunif()
    ), 
    
    # Use the full covariance (i.e., do not approximate)
    method = method_full() 
  )
  
  # Condition on the data at these fixed hyperparameters (no optimization)
  gp_raw <- gp_fit(gp_empty, x, y)
  
  # Store -log(marginal likelihood) for this grid point
  energy[i] <- gp_energy(gp_raw)
  
}

# Combine the hyperparameter grid with its energies
surface_plot <- cbind(hyperparams, energy)

# Grid point with the lowest energy (highest marginal likelihood)
surface_plot[which.min(surface_plot$energy),]
##       ell sigma_f   energy
## 4212 1.95     6.1 11.05812

Model with \(\ell = 2\) and \(\sigma_f^2 = 6\), near the optimal values. Recall that we are setting \(\sigma_n^2 = 0.01\).

# Specify the GP model we want to use,
# we will specify the lscale and magn using values from above
gp_empty <- gp_init(
  # A squared exponential (aka Gaussian aka RBF) kernel
  # with lscale = 2 and magn = 6, near the grid minimum found above
  cfs = cf_sexp(
    vars = NULL,
    lscale = 2,
    magn = 6,
    prior_lscale = prior_logunif(),
    prior_magn = prior_logunif(),
    normalize = FALSE
  ),  
  
  # Assume Gaussian distributed errors (small fixed noise, sigma = 0.01)
  lik = lik_gaussian(
    sigma = 0.01, 
    prior_sigma = prior_logunif()
  ), 
  
  # Use the full covariance (i.e., do not approximate)
  method = method_full() 
)

# Condition on the data at these fixed hyperparameters (no optimization)
gp_raw_ell2sigma6 <- gp_fit(gp_empty, x, y)

Plot the model.

# compute the predictive mean and variance in a grid of points
# (use TRUE rather than the reassignable alias T)
pred_ell2sigma6 <- gp_pred(gp_raw_ell2sigma6, xt, var = TRUE)

# visualize: predictive mean plus a +/- 2 standard-deviation band
mu_ell2sigma6 <- pred_ell2sigma6$mean
lb_ell2sigma6 <- pred_ell2sigma6$mean - 2*sqrt(pred_ell2sigma6$var)
ub_ell2sigma6 <- pred_ell2sigma6$mean + 2*sqrt(pred_ell2sigma6$var)

# Ribbon = 2-sd predictive band, line = predictive mean, points = data
ggplot() +
  geom_ribbon(aes(x = xt, ymin = lb_ell2sigma6, ymax = ub_ell2sigma6),
              fill = 'lightgray') +
  geom_line(aes(x = xt, y = mu_ell2sigma6), linewidth = 0.5) +
  geom_point(aes(x = x, y = y), size = 2) +
  xlab('x') +
  ylab('y')

Plot the entire surface.

# Grid dimensions derived from the data rather than hard-coded 51 x 121
# (expand.grid varies ell fastest, so rows of z index ell and
# columns index sigma_f)
n_ell     <- length(unique(surface_plot$ell))
n_sigma_f <- length(unique(surface_plot$sigma_f))

fig <- plot_ly() %>% 
  
  # Energy surface over the (sigma_f, ell) grid
  add_trace(
    x = unique(surface_plot$sigma_f), 
    y = unique(surface_plot$ell), 
    z = matrix(surface_plot$energy, nrow = n_ell, ncol = n_sigma_f),
    type = "surface") %>% 
  
  # Mark the grid minimum found above
  add_trace(
    x = 6.1,
    y = 1.95,
    z = 11.05812,
    type = "scatter3d",
    mode = "markers"
  ) %>%
  
  layout(
    scene = list(
      aspectmode = 'manual',  # Set to manual for custom aspect ratio
      aspectratio = list(x = 1, y = 1, z = 0.5),  # Adjust x, y, z ratios
      
      xaxis = list(
        title = "signal variance"
      ),
      
      yaxis = list(
        title = "length-scale parameter"
      ),
      
      zaxis = list(
        title = "-log(marginal likelihood)"
      )
    )
  )
  
fig

Plot the surface nearest the optimum value.

# Zoom in on the region near the optimum. Matrix dimensions are
# derived from the data rather than hard-coded 51 x 121; the slice
# indices select large sigma_f and moderate-to-large ell.
n_ell     <- length(unique(surface_plot$ell))
n_sigma_f <- length(unique(surface_plot$sigma_f))

fig <- plot_ly() %>% 
  
  add_trace(
    x = unique(surface_plot$sigma_f)[80:121], 
    y = unique(surface_plot$ell)[10:51], 
    z = matrix(surface_plot$energy,
               nrow = n_ell, ncol = n_sigma_f)[10:51, 80:121],
    type = "surface") %>% 
  
  # Mark the grid minimum found above
  add_trace(
    x = 6.1,
    y = 1.95,
    z = 11.05812,
    type = "scatter3d",
    mode = "markers"
  ) %>%
  
  layout(
    scene = list(
      aspectmode = 'manual',  # Set to manual for custom aspect ratio
      aspectratio = list(x = 1, y = 1, z = 0.5),  # Adjust x, y, z ratios
      
      xaxis = list(
        title = "signal variance"
      ),
      
      yaxis = list(
        title = "length-scale parameter"
      ),
      
      zaxis = list(
        title = "-log(marginal likelihood)"
      )
    )
  )
  
fig

More points!

Next, increase the number of points that we are fitting, but within the originally specified range of the function \(x \in (-2, 2)\). Below, I have added 3 new points.

# Seven training points now (previously five), same x-range [-2, 2]
n_new <- 7
x_new <- matrix(seq(from = -2, to = 2, length.out = n_new), ncol = 1)
y_new <- x_new^2

Plot the data.

# Scatter plot of the 7-point training data
ggplot() +
  geom_point(aes(x = x_new, y = y_new), size = 2) +
  xlab('x') +
  ylab('y')

Now, fit the model using gp_optim.

# Fresh GP model for the 7-point data, with the same initial
# hyperparameters as the original 5-point model above.
gp_empty_7 <- gp_init(
  
  # A squared exponential (aka Gaussian aka RBF) kernel
  # (initial lscale = 0.3, magn = 1, log-uniform priors)
  cfs = cf_sexp(
    vars = NULL,
    lscale = 0.3,
    magn = 1,
    prior_lscale = prior_logunif(),
    prior_magn = prior_logunif(),
    normalize = FALSE
  ),  
  
  # Assume Gaussian distributed errors (initial sigma = 0.5)
  lik = lik_gaussian(
    sigma = 0.5, 
    prior_sigma = prior_logunif()
  ), 
  
  # Use the full covariance (i.e., do not approximate)
  method = method_full() 
  
)

Optimize.

# Optimize hyperparameters on the 7-point data, printing the trace
gp_optimized_7 <- gp_optim(
  gp = gp_empty_7,
  x = x_new,
  y = y_new,
  restarts = 5,
  verbose = TRUE
)
## Optimizing parameters
## p1: log cf_sexp.lscale
## p2: log cf_sexp.magn
## p3: log lik_gaussian.sigma
## 
##       p1       p2       p3     Energy Iteration
##    -1.20     0.00    -0.69      21.92         0 
##    -1.08     0.00    -0.69      21.45         1 
##    -1.20     0.12    -0.69      19.95         2 
##    -1.20     0.00    -0.57      21.38         3 
##    -1.12     0.08    -0.61      20.04         4 
##    -1.14     0.06    -0.63      20.47         5 
##    -1.27     0.13    -0.56      19.57         6 
##    -1.36     0.20    -0.49      18.77         7 
##    -1.26     0.27    -0.63      18.15         8 
##    -1.28     0.40    -0.65      17.09         9 
##    -1.44     0.40    -0.61      17.24        10 
##    -1.36     0.32    -0.61      17.81        11 
##    -1.53     0.55    -0.48      16.39        12 
##    -1.69     0.76    -0.37      15.93        13 
##    -1.58     0.84    -0.60      15.91        14 
##    -1.69     1.16    -0.65      16.51        15 
##    -1.59     0.94    -0.47      15.99        16 
##    -1.55     0.80    -0.51      15.90        17 
##    -1.93     1.20    -0.33      16.68        18 
##    -1.77     1.00    -0.41      16.11        19 
##    -1.44     0.60    -0.57      16.20        20 
##    -1.69     0.90    -0.45      15.96        21 
##    -1.53     0.70    -0.53      15.98        22 
##    -1.65     0.85    -0.47      15.92        23 
##    -1.50     0.90    -0.68      15.92        24 
##    -1.55     0.87    -0.60      15.91        25 
##    -1.47     0.82    -0.67      15.88        26 
##    -1.38     0.81    -0.76      15.84        27 
##    -1.46     0.77    -0.64      15.90        28 
##    -1.48     0.79    -0.63      15.89        29 
##    -1.37     0.76    -0.67      15.85        30 
##    -1.42     0.78    -0.65      15.87        31 
##    -1.27     0.77    -0.87      15.77        32 
##    -1.13     0.75    -1.05      15.61        33 
##    -1.11     0.75    -1.02      15.57        34 
##    -0.92     0.74    -1.22      15.23        35 
##    -0.92     0.77    -1.36      15.23        36 
##    -0.70     0.78    -1.70      14.68        37 
##    -0.45     0.71    -1.88      13.94        38 
##     0.02     0.65    -2.44      12.98        39 
##     0.07     0.69    -2.52      12.67        40 
##     0.67     0.66    -3.26      23.11        41 
##     0.51     0.68    -3.22      16.47        42 
##    -0.56     0.72    -1.72      14.28        43 
##     0.38     0.60    -2.76      15.45        44 
##    -0.43     0.73    -1.96      13.84        45 
##     0.33     0.66    -2.90      13.68        46 
##     0.11     0.68    -2.60      12.75        47 
##     0.56     0.62    -3.08      19.80        48 
##    -0.18     0.70    -2.24      13.12        49 
##     0.31     0.65    -2.80      13.75        50 
##    -0.06     0.69    -2.38      12.90        51 
##     0.06     0.72    -2.56      12.53        52 
##     0.08     0.75    -2.62      12.32        53 
##     0.23     0.73    -2.78      12.41        54 
##     0.16     0.72    -2.68      12.44        55 
##     0.14     0.77    -2.68      12.10        56 
##     0.16     0.81    -2.72      11.80        57 
##     0.25     0.84    -2.89      11.58        58 
##     0.34     0.91    -3.07      11.12        59 
##     0.16     0.92    -2.83      11.37        60 
##     0.18     0.87    -2.81      11.50        61 
##     0.36     1.01    -3.12      10.46        62 
##     0.51     1.14    -3.37       9.80        63 
##     0.51     1.17    -3.47       9.61        64 
##     0.68     1.34    -3.84       9.01        65 
##     0.86     1.34    -4.03      11.88        66 
##     0.34     1.03    -3.13      10.42        67 
##     0.68     1.43    -3.82       8.37        68 
##     0.84     1.69    -4.19       7.38        69 
##     1.02     1.75    -4.48       8.62        70 
##     0.85     1.57    -4.14       8.35        71 
##     1.08     1.93    -4.74       7.29        72 
##     1.36     2.33    -5.43       7.25        73 
##     1.35     2.39    -5.33       6.40        74 
##     1.68     2.91    -6.08       6.36        75 
##     1.74     3.05    -6.33       5.62        76 
##     2.19     3.78    -7.42       6.62        77 
##     2.34     3.83    -7.70      13.26        78 
##     1.22     2.23    -5.07       6.18        79 
##     1.73     3.12    -6.22       4.56        80 
##     1.92     3.52    -6.62       3.52        81 
##     1.57     2.95    -5.93       3.85        82 
##     1.60     2.94    -5.97       4.27        83 
##     2.27     4.12    -7.52       3.61        84 
##     2.00     3.65    -6.90       3.66        85 
##     2.09     4.02    -7.05       1.80        86 
##     2.27     4.50    -7.41       0.82        87 
##     2.73     5.14    -8.43       1.79        88 
##     2.44     4.59    -7.80       2.05        89 
##     2.35     4.65    -7.45       0.77        90 
##     2.39     4.92    -7.42       0.36        91 
##     3.02     6.19    -8.89      -0.19        92 
##     3.57     7.52   -10.02      -0.22        93 
##     2.75     6.16    -8.14       0.75        94 
##     2.75     5.90    -8.21       0.18        95 
##     3.53     7.73    -9.70       0.67        96 
##     3.22     6.92    -9.12       0.20        97 
##     3.96     8.65   -10.81       1.04        98 
##     2.78     5.85    -8.27       0.02        99 
##     2.85     5.93    -8.54      -0.06       100 
##     2.94     6.18    -8.69      -0.05       101 
##     3.38     6.97    -9.68      -0.55       102 
##     3.70     7.50   -10.41      -1.06       103 
##     3.96     8.12   -11.05      -0.71       104 
##     3.67     7.55   -10.36      -0.74       105 
##     4.44     9.12   -11.98       0.71       106 
##     3.25     6.73    -9.40      -0.34       107 
##     3.51     6.99   -10.10      -1.11       108 
##     3.49     6.73   -10.13      -1.24       109 
##     3.99     7.79   -11.20      -1.46       110 
##     4.36     8.33   -12.09      -0.99       111 
##     3.79     7.13   -10.81      -1.62       112 
##     3.85     6.92   -11.03      -0.68       113 
##     3.81     6.94   -11.01      -1.11       114 
##     3.78     7.08   -10.86      -1.57       115 
##     4.22     7.94   -11.78      -1.15       116 
##     3.67     7.03   -10.54      -1.59       117 
##     3.50     6.37   -10.28      -0.76       118 
##     3.87     7.44   -10.97      -1.63       119 
##     3.77     7.32   -10.68      -1.59       120 
##     3.77     7.26   -10.73      -1.64       121 
##     3.95     7.52   -11.13      -1.59       122 
##     3.88     7.40   -10.98      -1.64       123 
##     3.89     7.60   -10.98      -1.54       124 
##     3.81     7.25   -10.85      -1.65       125 
##     3.77     7.17   -10.74      -1.65       126 
##     3.80     7.24   -10.80      -1.66       127 
##     3.71     7.10   -10.60      -1.62       128 
##     3.84     7.32   -10.89      -1.65       129 
##     3.86     7.28   -10.96      -1.61       130 
##     3.79     7.27   -10.79      -1.65       131 
##     3.81     7.30   -10.80      -1.65       132 
##     3.81     7.26   -10.84      -1.66       133 
##     3.77     7.19   -10.73      -1.65       134 
##     3.82     7.29   -10.85      -1.66       135 
## Assessing convergence...
##     3.90     7.24   -10.80      -1.42       136 
##     3.70     7.24   -10.80      -1.50       137 
##     3.80     7.34   -10.80      -1.63       138 
##     3.80     7.14   -10.80      -1.60       139 
##     3.80     7.24   -10.70      -1.66       140 
##     3.80     7.24   -10.90      -1.66       141 
## Optimization converged.

Optimal values of \(\sigma_f^2 = 7.24\) and \(\ell = 3.8\). (Note: the optimizer trace above labels its columns `log cf_sexp.magn` and `log cf_sexp.lscale`, so these figures may be log-scale values rather than the hyperparameters themselves — worth verifying.)

Pull the energy.

# Energy of the 7-point fit -- now negative, unlike the 5-point fits
gp_energy(gp_optimized_7)
## [1] -1.65625

Plot the optimal model.

# Predict over a much wider grid than the training range to show
# how the fit extrapolates (spell out length.out instead of the
# partially matched `len`)
xt_more <- seq(-10, 10, length.out = 150)
pred_7 <- gp_pred(gp = gp_optimized_7,
                  xnew = xt_more,
                  var = TRUE)

# Predictive mean plus a +/- 2 standard-deviation band
mu_pred_7 <- pred_7$mean
lb_pred_7 <- pred_7$mean - 2*sqrt(pred_7$var)
ub_pred_7 <- pred_7$mean + 2*sqrt(pred_7$var)

ggplot() + 
  geom_ribbon(aes(x=xt_more, ymin=lb_pred_7, ymax=ub_pred_7), fill='lightgray') +
  geom_line(aes(x=xt_more, y=mu_pred_7), linewidth = 0.5) +
  geom_point(aes(x=x_new, y=y_new), size=2) +
  xlab('x') + ylab('y')

Generate the surface

Recall, optimal value of \(\sigma_f^2 = 7.24\) and optimal value of \(\ell=3.8\). We will set \(\sigma_n^2 = 0.01\).

# Hyperparameter grid centered on the values found by gp_optim
ell <- seq(1, 6, by = 0.05)

sigma_f <- seq(4, 10, by = 0.05)

# Every (ell, sigma_f) pair, one row per candidate model
hyperparams <- expand.grid(
  ell = ell,
  sigma_f = sigma_f
)

# Preallocate one energy slot per grid point
energy <- numeric(nrow(hyperparams))

# run 6000+ models: fit one GP per grid point and record its energy
# (seq_len(nrow(...)) instead of 1:dim(...)[1] -- safe for empty grids)
for (i in seq_len(nrow(hyperparams))) {
  
  gp_empty <- gp_init(
    # A squared exponential (aka Gaussian aka RBF) kernel with the
    # length-scale and magnitude of this grid point
    cfs = cf_sexp(
      vars = NULL,
      lscale = hyperparams$ell[i],
      magn = hyperparams$sigma_f[i],
      prior_lscale = prior_logunif(),
      prior_magn = prior_logunif(),
      normalize = FALSE
    ),
    
    # Assume Gaussian distributed errors with a small fixed noise scale
    lik = lik_gaussian(
      sigma = 0.01, 
      prior_sigma = prior_logunif()
    ), 
    
    # Use the full covariance (i.e., do not approximate)
    method = method_full() 
  )
  
  # Condition on the 7-point data at these fixed hyperparameters
  gp_raw <- gp_fit(gp_empty, x_new, y_new)
  
  # Store -log(marginal likelihood) for this grid point
  energy[i] <- gp_energy(gp_raw)
  
}

# Combine the hyperparameter grid with its energies
surface_plot <- cbind(hyperparams, energy)

# Grid point with the lowest energy (highest marginal likelihood)
surface_plot[which.min(surface_plot$energy),]
##       ell sigma_f   energy
## 12159 2.9      10 5.433649

Does the range of energies now reach a lower value?

# Range of energies across the grid; compare against the earlier surface
range(surface_plot$energy)
## [1]  5.433649 97.868111

Plot the surface.

# Grid dimensions derived from the data rather than hard-coded 101 x 121
# (expand.grid varies ell fastest, so rows of z index ell and
# columns index sigma_f)
n_ell     <- length(unique(surface_plot$ell))
n_sigma_f <- length(unique(surface_plot$sigma_f))

fig <- plot_ly() %>% 
  
  # Energy surface over the (sigma_f, ell) grid
  add_trace(
    x = unique(surface_plot$sigma_f), 
    y = unique(surface_plot$ell), 
    z = matrix(surface_plot$energy, nrow = n_ell, ncol = n_sigma_f),
    type = "surface") %>% 
  
  # Mark the grid minimum found above
  add_trace(
    x = 10,
    y = 2.9,
    z = 5.433,
    type = "scatter3d",
    mode = "markers"
  ) %>%
  
  layout(
    scene = list(
      aspectmode = 'manual',  # Set to manual for custom aspect ratio
      aspectratio = list(x = 1, y = 1, z = 0.5),  # Adjust x, y, z ratios
      
      xaxis = list(
        title = "signal variance"
      ),
      
      yaxis = list(
        title = "length-scale parameter"
      ),
      
      zaxis = list(
        title = "-log(marginal likelihood)"
      )
    )
  )
  
fig

Plot the surface with lowest energies.

# Restrict to the first 45 length-scale values (the low-energy region).
# Matrix dimensions are derived from the data rather than hard-coded
# 101 x 121.
n_ell     <- length(unique(surface_plot$ell))
n_sigma_f <- length(unique(surface_plot$sigma_f))

fig <- plot_ly() %>% 
  
  add_trace(
    x = unique(surface_plot$sigma_f), 
    y = unique(surface_plot$ell)[1:45], 
    z = matrix(surface_plot$energy,
               nrow = n_ell, ncol = n_sigma_f)[1:45, ],
    type = "surface") %>% 
  
  # Mark the grid minimum found above
  add_trace(
    x = 10,
    y = 2.9,
    z = 5.433,
    type = "scatter3d",
    mode = "markers"
  ) %>%
  
  layout(
    scene = list(
      aspectmode = 'manual',  # Set to manual for custom aspect ratio
      aspectratio = list(x = 1, y = 1, z = 0.5),  # Adjust x, y, z ratios
      
      xaxis = list(
        title = "signal variance"
      ),
      
      yaxis = list(
        title = "length-scale parameter"
      ),
      
      zaxis = list(
        title = "-log(marginal likelihood)"
      )
    )
  )
  
fig

Broken optimization

\(n=7\) works, but \(n=8\) or \(n=9\) does not.

# Eight training points now, same x-range [-2, 2]
n_new <- 8
x_new <- matrix(seq(from = -2, to = 2, length.out = n_new), ncol = 1)
y_new <- x_new^2

Plot the data.

# Scatter plot of the 8-point training data
ggplot() +
  geom_point(aes(x = x_new, y = y_new), size = 2) +
  xlab('x') +
  ylab('y')

Now, fit the model using gp_optim.

# Fresh GP model for the 8-point data, with the same initial
# hyperparameters as the earlier models.
gp_empty_8 <- gp_init(
  
  # A squared exponential (aka Gaussian aka RBF) kernel
  # (initial lscale = 0.3, magn = 1, log-uniform priors)
  cfs = cf_sexp(
    vars = NULL,
    lscale = 0.3,
    magn = 1,
    prior_lscale = prior_logunif(),
    prior_magn = prior_logunif(),
    normalize = FALSE
  ),  
  
  # Assume Gaussian distributed errors (initial sigma = 0.5)
  lik = lik_gaussian(
    sigma = 0.5, 
    prior_sigma = prior_logunif()
  ), 
  
  # Use the full covariance (i.e., do not approximate)
  method = method_full() 
  
)

Optimize.

# With n = 8 this call fails: the Cholesky factorization of the
# covariance matrix errors out ("leading minor ... not positive", see
# below). NOTE(review): presumably the optimizer drives the noise so
# low that the matrix becomes numerically singular -- worth confirming.
gp_optimized_8 <- gp_optim(gp = gp_empty_8,
                           x = x_new,
                           y = y_new,
                           restarts = 5,
                           verbose = FALSE)
## Error in chol.default(B): the leading minor of order 4 is not positive